This notebook is to explore Canadian SARS-CoV-2 genomic and epidemiological data, for discussion with pillar 6’s team and for sharing with collaborators. These analyses can spur further research within or across pillars, be used for reports (or data dashboards), given to the science communication pillar for public dissemination, and the code can be repackaged to give to public health authorities for their internal use.
Canadian genomic and epidemiological data will be regularly pulled from various sources to keep these analyses up-to-date.
## 1. LOAD processed metadata of Canadian sequences (with latest pangolin, division, and full seq IDs)
#Download metadata from gisaid, put the date here:
gisaiddate="2021_12_10"
#date=2021_12_10
#tar -axf metadata_tsv_$date.tar.xz metadata.tsv -O | tr ' ' '_' | sed 's/\t\t/\tNA\t/g' | sed 's/\t\t/\tNA\t/g' | sed 's/\t$/\tNA/g' | awk 'NR==1 || substr($1,9,6)=="Canada" && $8=="Human"' | sort -k3,3 > metadata_CANall_$date.uncorrected.csv
#cat metadata_virrusseq_2021_12_08.tsv | tr ' ' '_' | sed 's/\t\t/\tNA\t/g' | sed 's/\t\t/\tNA\t/g' | sed 's/\t$/\tNA/g' | awk 'NR!=1 && $43!="NA"' | cut -f5,43 | sort -k2,2 > epidatesfromvirrusseq
#join -1 3 metadata_CANall_2021_12_10.uncorrected.csv -a 1 -2 2 epidatesfromvirrusseq | awk '$4!=$23 && length($4)<10 && length($23)==10{$4=$23} {id=$1;$1=$2;$2=$3;$3=id} {print}' | tr ' ' '\t' > metadata_CANall_2021_12_10.csv
#zip CoVaRRNet_pillar6notebook/data_needed/metadata_CANall_$date.zip metadata_CANall_$date.csv
metaCANall <- read.csv(unz(paste("./data_needed/metadata_CANall_",gisaiddate,".zip",sep=""),paste("metadata_CANall_",gisaiddate,".csv",sep="")),sep="\t")
metaCANall$Collection_date <- as.Date(metaCANall$Collection_date)
#max(metaCANall$Collection.date) - min(metaCANall$Collection.date) #time diff: 580 days
#make a pango.group column
metaCANall$pango.group <- metaCANall$Variant
metaCANall$pango.group[is.na(metaCANall$pango.group)]<-"other"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Alpha_202012/01_GRY_(B.1.1.7+Q.x)_first_detected_in_the_UK"]<-"Alpha"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Beta_GH/501Y.V2_(B.1.351+B.1.351.2+B.1.351.3)_first_detected_in_South_Africa" ]<-"Beta"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Gamma_GR/501Y.V3_(P.1+P.1.x)_first_detected_in_Brazil/Japan" ]<-"Gamma"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Delta_GK/478K.V1_(B.1.617.2+AY.x)_first_detected_in_India" ]<-"Delta"
metaCANall$pango.group[metaCANall$Pango_lineage == "AY.25" ]<-"Delta AY.25"
metaCANall$pango.group[metaCANall$Pango_lineage == "AY.27" ]<-"Delta AY.27"
metaCANall$pango.group[metaCANall$pango.group == "VOI_Lambda_GR/452Q.V1_(C.37+C.37.1)_first_detected_in_Peru" ]<-"Lambda"
metaCANall$pango.group[metaCANall$pango.group == "VOI_Mu_GH_(B.1.621+B.1.621.1)_first_detected_in_Colombia" ]<-"Mu"
metaCANall$pango.group[metaCANall$pango.group == "VOI_Mu_GH_(B.1.621+B.1.621.1)_first_detected_in_Colombia" ]<-"Mu"
metaCANall$pango.group[str_detect(metaCANall$pango.group,"V") ]<-"other"
metaCANall$pango.group[metaCANall$Pango_lineage == "A.23.1" ]<-"A.23.1"
metaCANall$pango.group[metaCANall$Pango_lineage == "B.1.438.1" ]<-"B.1.438.1"
## 2. LOAD epidemiological data (PHAC)
epidataCANall <- read.csv(url("https://health-infobase.canada.ca/src/data/covidLive/covid19-download.csv"))
#from: https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html?stat=num&measure=total&map=pt#a2
epidate = tail(epidataCANall,1)$date
#epidataAlberta <- epidataCANall[ which(epidataCANall$prnameFR=='Alberta'), ]
#print(epidataAlberta$numtoday)
Note that in this demo almost all of Ontario’s sequences were not included due to the lack of complete dates in GISAID. Moving forward, using VirusSeq Portal data, we should have better coverage in Ontario.
Sequence counts for all Canadian data in GISAID, up to 2021-12-14, shows that by end of summer Alpha and Gamma were still the most sequenced variants. Note that some of the “other” category include some Delta sublineages (AYs) but overall, from the beginning of the pandemic to the fall of 2021, Canadian sequences were mostly of the wildtype lineages (pre-VOCs).
# --- Histogram plot: counts per variant of all canadian data -------------
listpango <- unique(metaCANall$pango.group)
print(listpango)
## [1] "other" "Alpha" "B.1.438.1" "Beta" "A.23.1"
## [6] "Gamma" "Delta" "Lambda" "Delta AY.27" "Delta AY.25"
## [11] "Mu"
listpango <- listpango[order(listpango)]
getpal <- colorRampPalette(brewer.pal(9, "Paired")) #"Set3
pal <- getpal(length(listpango))
names(pal) <- listpango
#------------- counts over time (by week)
plot_metadatadf <- function(meta_tab,pal) {
meta_tab <- filter(meta_tab, !is.na(meta_tab$Collection_date))
df1 <- meta_tab %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) #adds week coloum
dfcount <- df1 %>% group_by(week) %>% count(pango.group)
#plot
ggplot(dfcount, aes(x=as.Date(week), y= n, fill=pango.group))+geom_bar(stat = "identity")+
scale_fill_manual(breaks=listpango, values = pal)+ ylab("")+xlab("")+
scale_x_date(date_breaks = "28 day")+
theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))
}
plot_metadatadf_freq <- function(meta_tab,pal) {
meta_tab <- filter(meta_tab, !is.na(meta_tab$Collection_date))
df1 <- meta_tab %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) #adds week coloum
dfcount <- df1 %>% group_by(week) %>% count(pango.group)
dfcount %>% mutate(freq = prop.table(n)) -> dfprop
#plot
ggplot(dfprop, aes(x= as.Date(week), y=freq, fill=pango.group))+geom_bar(stat = "identity")+
scale_fill_manual(breaks=listpango, values = pal)+ ylab("")+xlab("")+
scale_x_date(date_breaks = "28 day")+
theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))
}
plot_metadatadf(metaCANall,pal)
plot_metadatadf_freq(metaCANall,pal)
There are two PANGO lineages that have a Canadian origin and have predominately spread within Canada (with some exportations internationally): B.1.438.1 and B.1.1.176.
Other lineages of Canadian interest:
Here we show the same plots but for each provinces
loc=sapply(strsplit(metaCANall$Location,"_/_"), `[`, 3)
metaCANall$province <- loc
print(unique(metaCANall$province))
## [1] "New_Brunswick" "Manitoba"
## [3] "Nova_Scotia" "Newfoundland_and_Labrador"
## [5] "Ontario" "British_Columbia"
## [7] "Quebec" "Alberta"
## [9] NA "Saskatchewan"
## [11] "Newfoundland"
plot_metadatadf(metaCANall[ which(metaCANall$province=='British_Columbia'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='British_Columbia'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Alberta'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Alberta'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Saskatchewan'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Saskatchewan'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Manitoba'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Manitoba'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Ontario'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Ontario'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Quebec'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Quebec'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='Nova_Scotia'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Nova_Scotia'), ],pal)
plot_metadatadf(metaCANall[ which(metaCANall$province=='New_Brunswick'), ],pal)
plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='New_Brunswick'), ],pal)
subtab=metaCANall[ which(metaCANall$province=='Newfoundland' | metaCANall$province=='Newfoundland_and_Labrador' ), ]
plot_metadatadf(subtab,pal)
plot_metadatadf_freq(subtab,pal)
## Canadian trees
Down-sampled Canadian SARS-CoV-2 genomes. Taken from GISAID Sept. 12th, 2021. Alignment GISAID, ML tree in IQ-tree. This tree was generated using TreeTime and visualized in ggtree.
This tree shows several features of VOC spread in Canada:
These last two points are suggestive of strain (variant) replacement, i.e. competitive exclusion, but more detailed analyses would be required to clarify this.
Divergence tree from TreeTime run, visualized in ggtree.
There are various methods to investigate changes in evolutionary rates of VOC/VOIs and compare their relative fitness in an epidemiological context.
For example, root-to-tip plots give estimates of substitution rates. Clusters with stronger positive slopes than the average SARS-CoV-2 dataset, are an indication that they are accumlating mutations at a faster pace, or clusters that jump up above the average could inidcate a mutational jump (simlar to what we saw with Alpha when it first appeared in the UK).
ML IQ-tree run in Tempest for root-to-tip data. Linear regression and plotting in R.
Overall, the evolutionary rate of the Canadian data (0.000875 subs/site/year, grey line) is very close to the global average of 24.162 subs/year (= 0.000822 subs/site/year). Delta, Gamma, and Alpha are all slower than this global average.
Add here:
Are there any BEAST analyses we’d like to do? e.g. infer R0, serial interval, etc for the different Delta sublineages (might have to restrict this geographically to specific PTs with enough coverage)
Collect a list of bioinformatics, phylogenetic, and modelling tools that are useful for SARS-CoV-2 analyses:
The version numbers of all packages in the current environment as well as information about the R install is reported below.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] viridis_0.6.2 viridisLite_0.4.0 colorspace_2.0-2 RColorBrewer_1.1-2
## [5] kableExtra_1.3.4 gridExtra_2.3 ggbeeswarm_0.6.0 DT_0.20
## [9] cowplot_1.1.1 ggtree_3.2.1 phytools_0.7-90 maps_3.4.0
## [13] phangorn_2.8.0 tidytree_0.3.6 phylotools_0.2.2 ape_5.5
## [17] treeio_1.18.1 lubridate_1.8.0 reticulate_1.22 knitr_1.36
## [21] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [25] readr_2.1.1 tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5
## [29] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] ellipsis_0.3.2 fs_1.5.2 aplot_0.1.1
## [4] rstudioapi_0.13 farver_2.1.0 fansi_0.5.0
## [7] xml2_1.3.3 codetools_0.2-18 mnormt_2.0.2
## [10] jsonlite_1.7.2 broom_0.7.10 dbplyr_2.1.1
## [13] png_0.1-7 compiler_4.1.2 httr_1.4.2
## [16] backports_1.4.0 assertthat_0.2.1 Matrix_1.4-0
## [19] fastmap_1.1.0 lazyeval_0.2.2 cli_3.1.0
## [22] htmltools_0.5.2 tools_4.1.2 igraph_1.2.9
## [25] coda_0.19-4 gtable_0.3.0 glue_1.5.1
## [28] clusterGeneration_1.3.7 fastmatch_1.1-3 Rcpp_1.0.7
## [31] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [34] svglite_2.0.0 nlme_3.1-153 xfun_0.28
## [37] rvest_1.0.2 lifecycle_1.0.1 MASS_7.3-54
## [40] scales_1.1.1 hms_1.1.1 parallel_4.1.2
## [43] expm_0.999-6 yaml_2.2.1 ggfun_0.0.4
## [46] yulab.utils_0.0.4 stringi_1.7.6 highr_0.9
## [49] plotrix_3.8-2 rlang_0.4.12 pkgconfig_2.0.3
## [52] systemfonts_1.0.3 evaluate_0.14 lattice_0.20-45
## [55] labeling_0.4.2 patchwork_1.1.1 htmlwidgets_1.5.4
## [58] tidyselect_1.1.1 magrittr_2.0.1 R6_2.5.1
## [61] generics_0.1.1 combinat_0.0-8 DBI_1.1.1
## [64] pillar_1.6.4 haven_2.4.3 withr_2.4.3
## [67] scatterplot3d_0.3-41 modelr_0.1.8 crayon_1.4.2
## [70] utf8_1.2.2 tmvnsim_1.0-2 tzdb_0.2.0
## [73] rmarkdown_2.11 grid_4.1.2 readxl_1.3.1
## [76] reprex_2.0.1 digest_0.6.29 webshot_0.5.2
## [79] numDeriv_2016.8-1.1 gridGraphics_0.5-1 munsell_0.5.0
## [82] beeswarm_0.4.0 ggplotify_0.1.0 vipor_0.4.5
## [85] quadprog_1.5-8